library('tidyverse')
customers <- read.csv("C:/Users/ruth/Desktop/Olist/olist_customers_dataset.csv")
order.items <- read.csv("C:/Users/ruth/Desktop/Olist/olist_order_items_dataset.csv")
order.payments <- read.csv("C:/Users/ruth/Desktop/Olist/olist_order_payments_dataset.csv")
order.reviews <- read.csv("C:/Users/ruth/Desktop/Olist/olist_order_reviews_dataset.csv")
orders <- read.csv("C:/Users/ruth/Desktop/Olist/olist_orders_dataset.csv")
sellers <- read.csv("C:/Users/ruth/Desktop/Olist/olist_sellers_dataset.csv")
products <- read.csv("C:/Users/ruth/Desktop/Olist/olist_products_dataset.csv")
product.category <- read.csv("C:/Users/ruth/Desktop/Olist/product_category_name_translation.csv")
Changing the name of the product column
colnames(product.category)[1] <- "product_category_name"
head(product.category)
Joining the tables
prod <- products %>% inner_join(product.category)
Joining, by = "product_category_name"
olist <-customers %>%
inner_join(orders) %>%
inner_join(order.items) %>%
inner_join(order.payments) %>%
inner_join(order.reviews) %>%
inner_join(sellers) %>%
inner_join(prod) # %>%
Joining, by = "customer_id"
Joining, by = "order_id"
Joining, by = "order_id"
Joining, by = "order_id"
Joining, by = "seller_id"
Joining, by = "product_id"
# inner_join(geolocation,by=c('customer_city'='geolocation_city'))
dim(olist)
[1] 116581 40
looking for missing values
colSums(is.na(olist))
customer_id customer_unique_id customer_zip_code_prefix customer_city
0 0 0 0
customer_state order_id order_status order_purchase_timestamp
0 0 0 0
order_approved_at order_delivered_carrier_date order_delivered_customer_date order_estimated_delivery_date
0 0 0 0
order_item_id product_id seller_id shipping_limit_date
0 0 0 0
price freight_value payment_sequential payment_type
0 0 0 0
payment_installments payment_value review_id review_score
0 0 0 0
review_comment_title review_comment_message review_creation_date review_answer_timestamp
0 1 0 0
seller_zip_code_prefix seller_city seller_state product_category_name
0 0 0 0
product_name_lenght product_description_lenght product_photos_qty product_weight_g
0 0 0 1
product_length_cm product_height_cm product_width_cm product_category_name_english
1 1 1 0
Plotting for missing values
library(VIM)
Loading required package: colorspace
Loading required package: grid
VIM is ready to use.
Suggestions and bug-reports can be submitted at: https://github.com/statistikat/VIM/issues
Attaching package: 㤼㸱VIM㤼㸲
The following object is masked from 㤼㸱package:datasets㤼㸲:
sleep
aggr_plot <- aggr(olist, col=c('navyblue','red'), numbers=TRUE, sortVars=TRUE, labels=names(olist), cex.axis=.7, gap=3, ylab=c("Histogram of missing data","Pattern"))
Variables sorted by number of missings:
Dropping the missing values
olist[rownames(na.omit(olist)),]
# rownames(olist)
Checking for duplicates
anyDuplicated(olist)
[1] 0
Dropping irrelevant columns
olist.copy <- olist
colnames(olist)
[1] "customer_id" "customer_unique_id" "customer_zip_code_prefix"
[4] "customer_city" "customer_state" "order_id"
[7] "order_status" "order_purchase_timestamp" "order_approved_at"
[10] "order_delivered_carrier_date" "order_delivered_customer_date" "order_estimated_delivery_date"
[13] "order_item_id" "product_id" "seller_id"
[16] "shipping_limit_date" "price" "freight_value"
[19] "payment_sequential" "payment_type" "payment_installments"
[22] "payment_value" "review_id" "review_score"
[25] "review_comment_title" "review_comment_message" "review_creation_date"
[28] "review_answer_timestamp" "seller_zip_code_prefix" "seller_city"
[31] "seller_state" "product_category_name" "product_name_lenght"
[34] "product_description_lenght" "product_photos_qty" "product_weight_g"
[37] "product_length_cm" "product_height_cm" "product_width_cm"
[40] "product_category_name_english"
olist <- olist.copy[,-c(1,2,3,5,6,13,14,15,18,25,23,26,28,29,31,40)]
colnames(olist)
[1] "customer_city" "order_status" "order_purchase_timestamp"
[4] "order_approved_at" "order_delivered_carrier_date" "order_delivered_customer_date"
[7] "order_estimated_delivery_date" "shipping_limit_date" "price"
[10] "payment_sequential" "payment_type" "payment_installments"
[13] "payment_value" "review_score" "review_creation_date"
[16] "seller_city" "product_category_name" "product_name_lenght"
[19] "product_description_lenght" "product_photos_qty" "product_weight_g"
[22] "product_length_cm" "product_height_cm" "product_width_cm"
Save to a CSV file.
write.csv(olist,'olist2.csv')
Renaming the columns that are misspelled
colnames(olist)
[1] "customer_city" "order_status" "order_purchase_timestamp"
[4] "order_approved_at" "order_delivered_carrier_date" "order_delivered_customer_date"
[7] "order_estimated_delivery_date" "shipping_limit_date" "price"
[10] "payment_sequential" "payment_type" "payment_installments"
[13] "payment_value" "review_score" "review_creation_date"
[16] "seller_city" "product_category_name" "product_name_length"
[19] "product_description_length" "product_photos_qty" "product_weight_g"
[22] "product_length_cm" "product_height_cm" "product_width_cm"
Checking for outliers in the numeric columns
#get the numeric columns
num.col <- Filter(is.numeric, olist)
for (i in 1:length(num.col)) {
boxplot(num.col[,i], main=names(num.col[i]), type="1")
print(i)
}
[1] 1
[1] 2
[1] 3
[1] 4
[1] 5
[1] 6
[1] 7
[1] 8
[1] 9
[1] 10
[1] 11
[1] 12
UNIVARIATE ANALYSIS
#Getting the general overview of the data
summary(olist)
customer_city order_status order_purchase_timestamp order_approved_at order_delivered_carrier_date
Length:116581 Length:116581 Length:116581 Length:116581 Length:116581
Class :character Class :character Class :character Class :character Class :character
Mode :character Mode :character Mode :character Mode :character Mode :character
order_delivered_customer_date order_estimated_delivery_date shipping_limit_date price payment_sequential
Length:116581 Length:116581 Length:116581 Min. : 0.85 Min. : 1.000
Class :character Class :character Class :character 1st Qu.: 39.90 1st Qu.: 1.000
Mode :character Mode :character Mode :character Median : 74.90 Median : 1.000
Mean : 120.75 Mean : 1.093
3rd Qu.: 134.90 3rd Qu.: 1.000
Max. :6735.00 Max. :29.000
payment_type payment_installments payment_value review_score review_creation_date seller_city
Length:116581 Min. : 0.00 Min. : 0.0 Min. :1.000 Length:116581 Length:116581
Class :character 1st Qu.: 1.00 1st Qu.: 61.0 1st Qu.:4.000 Class :character Class :character
Mode :character Median : 2.00 Median : 108.2 Median :5.000 Mode :character Mode :character
Mean : 2.95 Mean : 172.9 Mean :4.018
3rd Qu.: 4.00 3rd Qu.: 189.7 3rd Qu.:5.000
Max. :24.00 Max. :13664.1 Max. :5.000
product_category_name product_name_lenght product_description_lenght product_photos_qty product_weight_g product_length_cm
Length:116581 Min. : 5.00 Min. : 4.0 Min. : 1.000 Min. : 0 Min. : 7.00
Class :character 1st Qu.:42.00 1st Qu.: 346.0 1st Qu.: 1.000 1st Qu.: 300 1st Qu.: 18.00
Mode :character Median :52.00 Median : 600.0 Median : 1.000 Median : 700 Median : 25.00
Mean :48.77 Mean : 785.9 Mean : 2.205 Mean : 2116 Mean : 30.32
3rd Qu.:57.00 3rd Qu.: 983.0 3rd Qu.: 3.000 3rd Qu.: 1800 3rd Qu.: 38.00
Max. :76.00 Max. :3992.0 Max. :20.000 Max. :40425 Max. :105.00
NA's :1 NA's :1
product_height_cm product_width_cm product_category_name_english
Min. : 2.00 Min. : 6.00 Length:116581
1st Qu.: 8.00 1st Qu.: 15.00 Class :character
Median : 13.00 Median : 20.00 Mode :character
Mean : 16.65 Mean : 23.12
3rd Qu.: 20.00 3rd Qu.: 30.00
Max. :105.00 Max. :118.00
NA's :1 NA's :1
desc_stats <- data.frame(
min = apply(num.col, 2, min),
median = apply(num.col, 2, median),
mean_df = apply(num.col, 2, mean),
SD = apply(num.col, 2, sd),
max = apply(num.col, 2, max)
)
desc_stats <- round(desc_stats,1)
head(desc_stats)
library(dplyr)
library(inspectdf)
package 㤼㸱inspectdf㤼㸲 was built under R version 4.0.3
inspect_cat(olist)
Column (2/12): order_status
Column (3/12): order_purchase_timestamp
Column (4/12): order_approved_at
Column (5/12): order_delivered_carrier_date
Column (6/12): order_delivered_customer_date
Column (7/12): order_estimated_delivery_date
Column (8/12): shipping_limit_date
Column (9/12): payment_type
Column (10/12): review_creation_date
Column (11/12): seller_city
Column (12/12): product_category_name
common_pcnt, the percentage of each column occupied by the most common level shown in common.